Download course materials from here.
bit.ly/Leeds_2019-02-28
install.packages(tidyverse,ordinal,lme4)
Dataset
sleep:
head(sleep)
## extra group ID
## 1 0.7 1 1
## 2 -1.6 1 2
## 3 -0.2 1 3
## 4 -1.2 1 4
## 5 -0.1 1 5
## 6 3.4 1 6
What does this do?
sleep[1,] # what's this do? (row 1)
## extra group ID
## 1 0.7 1 1
How does this differ from [1,]?
sleep[2,] # so that means this is row 2
## extra group ID
## 2 -1.6 1 2
How does this differ from [3,]?
sleep[,3] # (column 3)
## [1] 1 2 3 4 5 6 7 8 9 10 1 2 3 4 5 6 7 8 9 10
## Levels: 1 2 3 4 5 6 7 8 9 10
…which makes it dataset[row,column]
You can also navigate with column names:
sleep$ID
## [1] 1 2 3 4 5 6 7 8 9 10 1 2 3 4 5 6 7 8 9 10
## Levels: 1 2 3 4 5 6 7 8 9 10
How would you view the column extra?
sleep$extra
## [1] 0.7 -1.6 -0.2 -1.2 -0.1 3.4 3.7 0.8 0.0 2.0 1.9 0.8 1.1 0.1
## [15] -0.1 4.4 5.5 1.6 4.6 3.4
Use str() to get a summary of the structure of the dataset
str(sleep)
## 'data.frame': 20 obs. of 3 variables:
## $ extra: num 0.7 -1.6 -0.2 -1.2 -0.1 3.4 3.7 0.8 0 2 ...
## $ group: Factor w/ 2 levels "1","2": 1 1 1 1 1 1 1 1 1 1 ...
## $ ID : Factor w/ 10 levels "1","2","3","4",..: 1 2 3 4 5 6 7 8 9 10 ...
What are all the unique values in ID?
unique(sleep$extra)
## [1] 0.7 -1.6 -0.2 -1.2 -0.1 3.4 3.7 0.8 0.0 2.0 1.9 1.1 0.1 4.4
## [15] 5.5 1.6 4.6
What’s the value in the first row, third column?
sleep[1,3]
## [1] 1
## Levels: 1 2 3 4 5 6 7 8 9 10
What’s the first element in the column ID?
sleep$ID[1]
## [1] 1
## Levels: 1 2 3 4 5 6 7 8 9 10
You can also view the dataset as a spreadsheet (although it can’t be altered).
View(sleep)
library(tidyverse)
## ── Attaching packages ────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse 1.2.1 ──
## ✔ ggplot2 3.1.0 ✔ purrr 0.3.0
## ✔ tibble 2.0.1 ✔ dplyr 0.8.0.1
## ✔ tidyr 0.8.2 ✔ stringr 1.4.0
## ✔ readr 1.3.1 ✔ forcats 0.4.0
## ── Conflicts ───────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
A tibble is different from a table.
as.tibble(sleep)
## Warning: `as.tibble()` is deprecated, use `as_tibble()` (but mind the new semantics).
## This warning is displayed once per session.
## # A tibble: 20 x 3
## extra group ID
## <dbl> <fct> <fct>
## 1 0.7 1 1
## 2 -1.6 1 2
## 3 -0.2 1 3
## 4 -1.2 1 4
## 5 -0.1 1 5
## 6 3.4 1 6
## 7 3.7 1 7
## 8 0.8 1 8
## 9 0 1 9
## 10 2 1 10
## 11 1.9 2 1
## 12 0.8 2 2
## 13 1.1 2 3
## 14 0.1 2 4
## 15 -0.1 2 5
## 16 4.4 2 6
## 17 5.5 2 7
## 18 1.6 2 8
## 19 4.6 2 9
## 20 3.4 2 10
Pipes are like toy funnels
%>%
sleep %>%
head()
## extra group ID
## 1 0.7 1 1
## 2 -1.6 1 2
## 3 -0.2 1 3
## 4 -1.2 1 4
## 5 -0.1 1 5
## 6 3.4 1 6
# or
sleep %>%
head
## extra group ID
## 1 0.7 1 1
## 2 -1.6 1 2
## 3 -0.2 1 3
## 4 -1.2 1 4
## 5 -0.1 1 5
## 6 3.4 1 6
How many attestations of each type of group?
sleep %>%
count(group)
## # A tibble: 2 x 2
## group n
## <fct> <int>
## 1 1 10
## 2 2 10
How can you make a new column? Duplicate group into group2:
sleep %>%
mutate(group2 = group)
## extra group ID group2
## 1 0.7 1 1 1
## 2 -1.6 1 2 1
## 3 -0.2 1 3 1
## 4 -1.2 1 4 1
## 5 -0.1 1 5 1
## 6 3.4 1 6 1
## 7 3.7 1 7 1
## 8 0.8 1 8 1
## 9 0.0 1 9 1
## 10 2.0 1 10 1
## 11 1.9 2 1 2
## 12 0.8 2 2 2
## 13 1.1 2 3 2
## 14 0.1 2 4 2
## 15 -0.1 2 5 2
## 16 4.4 2 6 2
## 17 5.5 2 7 2
## 18 1.6 2 8 2
## 19 4.6 2 9 2
## 20 3.4 2 10 2
Let’s rename the levels in group to be “one” and “two”:
sleep %>%
mutate(group2 = case_when(group==1 ~ "one",
TRUE ~ "two"))
## extra group ID group2
## 1 0.7 1 1 one
## 2 -1.6 1 2 one
## 3 -0.2 1 3 one
## 4 -1.2 1 4 one
## 5 -0.1 1 5 one
## 6 3.4 1 6 one
## 7 3.7 1 7 one
## 8 0.8 1 8 one
## 9 0.0 1 9 one
## 10 2.0 1 10 one
## 11 1.9 2 1 two
## 12 0.8 2 2 two
## 13 1.1 2 3 two
## 14 0.1 2 4 two
## 15 -0.1 2 5 two
## 16 4.4 2 6 two
## 17 5.5 2 7 two
## 18 1.6 2 8 two
## 19 4.6 2 9 two
## 20 3.4 2 10 two
What’s wrong with this one?
sleep %>%
mutate(group2 = case_when(group==1 ~ as.factor("one"),
group==2 ~ as.factor("two")))
## Warning in `[<-.factor`(`*tmp*`, i, value = structure(1L, .Label = "two",
## class = "factor")): invalid factor level, NA generated
## extra group ID group2
## 1 0.7 1 1 one
## 2 -1.6 1 2 one
## 3 -0.2 1 3 one
## 4 -1.2 1 4 one
## 5 -0.1 1 5 one
## 6 3.4 1 6 one
## 7 3.7 1 7 one
## 8 0.8 1 8 one
## 9 0.0 1 9 one
## 10 2.0 1 10 one
## 11 1.9 2 1 <NA>
## 12 0.8 2 2 <NA>
## 13 1.1 2 3 <NA>
## 14 0.1 2 4 <NA>
## 15 -0.1 2 5 <NA>
## 16 4.4 2 6 <NA>
## 17 5.5 2 7 <NA>
## 18 1.6 2 8 <NA>
## 19 4.6 2 9 <NA>
## 20 3.4 2 10 <NA>
sleep %>%
mutate(group2 = case_when(group==1 ~"one",
group==2 ~"two")) %>%
mutate(group2 = as.factor(group2))
## extra group ID group2
## 1 0.7 1 1 one
## 2 -1.6 1 2 one
## 3 -0.2 1 3 one
## 4 -1.2 1 4 one
## 5 -0.1 1 5 one
## 6 3.4 1 6 one
## 7 3.7 1 7 one
## 8 0.8 1 8 one
## 9 0.0 1 9 one
## 10 2.0 1 10 one
## 11 1.9 2 1 two
## 12 0.8 2 2 two
## 13 1.1 2 3 two
## 14 0.1 2 4 two
## 15 -0.1 2 5 two
## 16 4.4 2 6 two
## 17 5.5 2 7 two
## 18 1.6 2 8 two
## 19 4.6 2 9 two
## 20 3.4 2 10 two
Now, let’s recreate the count function with group_by and summarise:
sleep %>%
group_by(group) %>%
summarise(n = n())
## # A tibble: 2 x 2
## group n
## <fct> <int>
## 1 1 10
## 2 2 10
We cal use group_by and summarise to do a lot more than just count:
# mean value of `extra` by `group2`
sleep %>%
mutate(group2 = case_when(group==1 ~"one",
group==2 ~"two")) %>%
mutate(group2 = as.factor(group2)) %>%
group_by(group) %>%
summarise(effectMean = mean(extra))
## # A tibble: 2 x 2
## group effectMean
## <fct> <dbl>
## 1 1 0.75
## 2 2 2.33
Dataset
quakes:
What does quakes look like?
quakes %>%
str
## 'data.frame': 1000 obs. of 5 variables:
## $ lat : num -20.4 -20.6 -26 -18 -20.4 ...
## $ long : num 182 181 184 182 182 ...
## $ depth : int 562 650 42 626 649 195 82 194 211 622 ...
## $ mag : num 4.8 4.2 5.4 4.1 4 4 4.8 4.4 4.7 4.3 ...
## $ stations: int 41 15 43 19 11 12 43 15 35 19 ...
How many observations are there per “level” of magnitude?
quakes %>%
group_by(mag) %>%
summarise(numberOfObvs = n())
## # A tibble: 22 x 2
## mag numberOfObvs
## <dbl> <int>
## 1 4 46
## 2 4.1 55
## 3 4.2 90
## 4 4.3 85
## 5 4.4 101
## 6 4.5 107
## 7 4.6 101
## 8 4.7 98
## 9 4.8 65
## 10 4.9 54
## # … with 12 more rows
Let’s create a table of the means, standard deviations, and standard errors for both stations reporting and depths:
quakes %>%
group_by(mag) %>%
summarise(numberOfObvs = n(),
stationMean = mean(stations),
stationSD = sd(stations),
stationSE = sd(stations)/sqrt(numberOfObvs),
depthMean = mean(depth),
depthSD = sd(depth),
depthSE = sd(depth)/sqrt(numberOfObvs))
## # A tibble: 22 x 8
## mag numberOfObvs stationMean stationSD stationSE depthMean depthSD
## <dbl> <int> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 4 46 14.9 4.25 0.626 410. 170.
## 2 4.1 55 15.7 4.39 0.591 412. 201.
## 3 4.2 90 18.4 5.54 0.584 390. 186.
## 4 4.3 85 19.3 5.94 0.644 358. 204.
## 5 4.4 101 22.3 8.06 0.802 307. 205.
## 6 4.5 107 24.4 8.31 0.803 334. 214.
## 7 4.6 101 27.3 9.67 0.962 331. 224.
## 8 4.7 98 31.2 9.11 0.921 238. 204.
## 9 4.8 65 36.8 11.2 1.38 229. 199.
## 10 4.9 54 42.9 13.3 1.80 248. 228.
## # … with 12 more rows, and 1 more variable: depthSE <dbl>
This dataset is wide. Let’s make it long using gather (compare to spread):
quakes %>%
group_by(mag) %>%
summarise(numberOfObvs = n(),
stationMean = mean(stations),
stationSD = sd(stations),
stationSE = sd(stations)/sqrt(numberOfObvs),
depthMean = mean(depth),
depthSD = sd(depth),
depthSE = sd(depth)/sqrt(numberOfObvs)) %>%
gather(measurement, values, 3:8)
## # A tibble: 132 x 4
## mag numberOfObvs measurement values
## <dbl> <int> <chr> <dbl>
## 1 4 46 stationMean 14.9
## 2 4.1 55 stationMean 15.7
## 3 4.2 90 stationMean 18.4
## 4 4.3 85 stationMean 19.3
## 5 4.4 101 stationMean 22.3
## 6 4.5 107 stationMean 24.4
## 7 4.6 101 stationMean 27.3
## 8 4.7 98 stationMean 31.2
## 9 4.8 65 stationMean 36.8
## 10 4.9 54 stationMean 42.9
## # … with 122 more rows
# https://bookdown.org/ndphillips/YaRrr/pirateplot.html
library(tidyverse)
# install.packages("ggbeeswarm")
library(ggbeeswarm)
Dataset
iris:
The function ggplot layers different geometries and aesthetics to build up a plot:
iris %>%
mutate(Sepal.Width = Sepal.Width+rnorm(length(Sepal.Width),0,.1))%>%
ggplot(aes(x=Species,y=Sepal.Width,fill=Species)) +
geom_violin(lty=0,alpha=.5)+
geom_boxplot(alpha=0.5,lwd=.5) +
geom_quasirandom(dodge.width=1, alpha=.5)
In what ways might we change this plot?
iris %>%
mutate(
Sepal.Width = Sepal.Width +
rnorm(
length(Sepal.Width) , 0 , .1
)
) %>%
ggplot(aes(x=Species,y=Sepal.Width)) +
geom_boxplot() +
geom_quasirandom(aes(pch=Species, colour=Species), alpha=.5, cex=3,) +
theme_bw()
Going back to quakes, let’s pipe our table into a ggplot (and fill in missing values):
quakes %>%
group_by(mag) %>%
summarise(n=n(),
sta=mean(stations),
staSD=sd(stations),
staSE=staSD/sqrt(n),
dep=mean(depth),
depSD=sd(depth),
depSE=depSD/sqrt(n)) %>%
ggplot(aes(x=mag)) +
geom_point(aes(y=sta),colour="red")+
geom_path(aes(y=sta),colour="red")+
geom_ribbon(aes(ymin=sta-staSD,ymax=sta+staSD),fill="red",alpha=.2) +
geom_ribbon(aes(ymin=sta-staSE,ymax=sta+staSE),fill="red",alpha=.5) +
geom_point(aes(y=dep),colour="blue")+
geom_path(aes(y=dep),colour="blue")+
geom_ribbon(aes(ymin=dep-depSD,ymax=dep+depSD),fill="blue",alpha=.2) +
geom_ribbon(aes(ymin=dep-depSE,ymax=dep+depSE),fill="blue",alpha=.5) +
theme_bw() + ylab("depth OR number of stations reporting")
See Simulated Data notebook
data <- read_csv("../data/simulated-data.csv")
## Parsed with column specification:
## cols(
## subj = col_double(),
## age = col_double(),
## item = col_double(),
## freq = col_character(),
## gram = col_character(),
## rating = col_double(),
## accuracy = col_double(),
## region = col_double(),
## word = col_character(),
## rt = col_double()
## )
Plot boxplots and violin plots for the ratings. Subset by participant.
data %>%
filter(region == 1) %>%
ggplot(aes(x=gram, y=rating, fill=freq)) +
geom_boxplot(position = position_dodge(.9)) +
facet_wrap(~subj)
Group by region, word, frequency, and grammaticality. Summarise mean and standard error.
data %>%
group_by(region, word, freq, gram) %>%
summarise(meanRT = mean(rt),
seRT = sd(rt)/sqrt(n())) %>%
ggplot(aes(x=region,y=meanRT,colour=gram)) +
geom_point() +
geom_path(aes(lty=freq)) +
geom_errorbar(aes(ymin=meanRT-seRT, ymax=meanRT+seRT),width=.1) +
scale_x_continuous(breaks=c(1:5),labels = c("the","old","VERB","the","boat")) +
ylab("reading time (miliseconds)")
Well, check out the Datasaurus Dozen, which all have the same x/y means and standard deviations:
library(lme4)
## Loading required package: Matrix
##
## Attaching package: 'Matrix'
## The following object is masked from 'package:tidyr':
##
## expand
Build a simple linear model to examine region 3 (the verb):
# summary(
# lm ( DV ~ IV1 * IV2 + IV3 , data = data )
# )
summary(lm(rt ~ freq * gram + age,
data[data$region==3,]))
##
## Call:
## lm(formula = rt ~ freq * gram + age, data = data[data$region ==
## 3, ])
##
## Residuals:
## Min 1Q Median 3Q Max
## -266.663 -56.801 3.386 52.437 271.658
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 321.2832 12.1092 26.532 < 2e-16 ***
## freqlow 31.2062 8.0033 3.899 0.000105 ***
## gramyes -1.9600 8.0033 -0.245 0.806600
## age 1.2848 0.2503 5.134 3.58e-07 ***
## freqlow:gramyes 1.5559 11.3184 0.137 0.890697
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 80.03 on 795 degrees of freedom
## Multiple R-squared: 0.06839, Adjusted R-squared: 0.0637
## F-statistic: 14.59 on 4 and 795 DF, p-value: 1.663e-11
Add in mixed effects for a linear mixed effects model:
#summary(
# lmer( DV ~ IV1 * IV2 + IV3 + ( 1 | RV1 ) + ( 1 | RV2 ) , data = data)
# )
summary(lmer(rt ~ freq+gram+freq:gram+age + (1|subj) + (1|item),
data %>% filter(region==3)))
## Linear mixed model fit by REML ['lmerMod']
## Formula: rt ~ freq + gram + freq:gram + age + (1 | subj) + (1 | item)
## Data: data %>% filter(region == 3)
##
## REML criterion at convergence: 9254.5
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -3.1961 -0.7094 0.0332 0.6754 3.3830
##
## Random effects:
## Groups Name Variance Std.Dev.
## subj (Intercept) 129.944 11.399
## item (Intercept) 8.753 2.958
## Residual 6273.993 79.209
## Number of obs: 800, groups: subj, 40; item, 20
##
## Fixed effects:
## Estimate Std. Error t value
## (Intercept) 321.2832 13.9689 23.000
## freqlow 31.2062 8.1389 3.834
## gramyes -1.9600 8.1389 -0.241
## age 1.2848 0.2946 4.362
## freqlow:gramyes 1.5559 11.5101 0.135
##
## Correlation of Fixed Effects:
## (Intr) freqlw gramys age
## freqlow -0.291
## gramyes -0.291 0.500
## age -0.902 0.000 0.000
## frqlw:grmys 0.206 -0.707 -0.707 0.000
Bodo Winter telling it like it is. (Photo credit: Adam Schembri 27/02/2019)
We should do model comparison to assess the contribution of each of the factors to the overall fit. But, read the Bates et al and Barr et al papers for an overview of the debates around how to design and test models.
Let’s do model comparison for region 3:
mdl1.max <- lmer(rt ~ freq*gram + age + accuracy + (1|subj),data[data$region==3,])
mdl1.age <- lmer(rt ~ freq*gram + accuracy + (1|subj),data[data$region==3,])
mdl1.acc <- lmer(rt ~ freq*gram + age + (1|subj),data[data$region==3,])
mdl1.int <- lmer(rt ~ freq+gram + age + accuracy + (1|subj),data[data$region==3,])
mdl1.frq <- lmer(rt ~ gram + age + accuracy + (1|subj),data[data$region==3,])
mdl1.grm <- lmer(rt ~ freq + age + accuracy + (1|subj),data[data$region==3,])
anova(mdl1.max,mdl1.age) # max vs age
## refitting model(s) with ML (instead of REML)
## Data: data[data$region == 3, ]
## Models:
## mdl1.age: rt ~ freq * gram + accuracy + (1 | subj)
## mdl1.max: rt ~ freq * gram + age + accuracy + (1 | subj)
## Df AIC BIC logLik deviance Chisq Chi Df Pr(>Chisq)
## mdl1.age 7 9305.5 9338.3 -4645.7 9291.5
## mdl1.max 8 9291.3 9328.8 -4637.7 9275.3 16.152 1 5.846e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
anova(mdl1.max,mdl1.acc) # max vs acc
## refitting model(s) with ML (instead of REML)
## Data: data[data$region == 3, ]
## Models:
## mdl1.acc: rt ~ freq * gram + age + (1 | subj)
## mdl1.max: rt ~ freq * gram + age + accuracy + (1 | subj)
## Df AIC BIC logLik deviance Chisq Chi Df Pr(>Chisq)
## mdl1.acc 7 9289.3 9322.1 -4637.7 9275.3
## mdl1.max 8 9291.3 9328.8 -4637.7 9275.3 0.0064 1 0.9362
anova(mdl1.max,mdl1.int) # max vs int
## refitting model(s) with ML (instead of REML)
## Data: data[data$region == 3, ]
## Models:
## mdl1.int: rt ~ freq + gram + age + accuracy + (1 | subj)
## mdl1.max: rt ~ freq * gram + age + accuracy + (1 | subj)
## Df AIC BIC logLik deviance Chisq Chi Df Pr(>Chisq)
## mdl1.int 7 9289.4 9322.1 -4637.7 9275.4
## mdl1.max 8 9291.3 9328.8 -4637.7 9275.3 0.0214 1 0.8836
anova(mdl1.int,mdl1.frq) # int vs frq
## refitting model(s) with ML (instead of REML)
## Data: data[data$region == 3, ]
## Models:
## mdl1.frq: rt ~ gram + age + accuracy + (1 | subj)
## mdl1.int: rt ~ freq + gram + age + accuracy + (1 | subj)
## Df AIC BIC logLik deviance Chisq Chi Df Pr(>Chisq)
## mdl1.frq 6 9319.3 9347.5 -4653.7 9307.3
## mdl1.int 7 9289.4 9322.1 -4637.7 9275.4 31.991 1 1.549e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
anova(mdl1.int,mdl1.grm) # int vs grm
## refitting model(s) with ML (instead of REML)
## Data: data[data$region == 3, ]
## Models:
## mdl1.grm: rt ~ freq + age + accuracy + (1 | subj)
## mdl1.int: rt ~ freq + gram + age + accuracy + (1 | subj)
## Df AIC BIC logLik deviance Chisq Chi Df Pr(>Chisq)
## mdl1.grm 6 9287.4 9315.5 -4637.7 9275.4
## mdl1.int 7 9289.4 9322.1 -4637.7 9275.4 0.0451 1 0.8318
How do regions 4 and 5 compare?:
mdl2.max <- lmer(rt ~ freq*gram + age + accuracy + (1|subj),data[data$region==4,])
mdl2.age <- lmer(rt ~ freq*gram + accuracy + (1|subj),data[data$region==4,])
mdl2.acc <- lmer(rt ~ freq*gram + age + (1|subj),data[data$region==4,])
## singular fit
mdl2.int <- lmer(rt ~ freq+gram + age + accuracy + (1|subj),data[data$region==4,])
mdl2.frq <- lmer(rt ~ gram + age + accuracy + (1|subj),data[data$region==4,])
mdl2.grm <- lmer(rt ~ freq + age + accuracy + (1|subj),data[data$region==4,])
## singular fit
anova(mdl2.max,mdl2.age) # max vs age
## refitting model(s) with ML (instead of REML)
## Data: data[data$region == 4, ]
## Models:
## mdl2.age: rt ~ freq * gram + accuracy + (1 | subj)
## mdl2.max: rt ~ freq * gram + age + accuracy + (1 | subj)
## Df AIC BIC logLik deviance Chisq Chi Df Pr(>Chisq)
## mdl2.age 7 9677.9 9710.7 -4831.9 9663.9
## mdl2.max 8 9667.6 9705.1 -4825.8 9651.6 12.255 1 0.0004641 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
anova(mdl2.max,mdl2.acc) # max vs acc
## refitting model(s) with ML (instead of REML)
## Data: data[data$region == 4, ]
## Models:
## mdl2.acc: rt ~ freq * gram + age + (1 | subj)
## mdl2.max: rt ~ freq * gram + age + accuracy + (1 | subj)
## Df AIC BIC logLik deviance Chisq Chi Df Pr(>Chisq)
## mdl2.acc 7 9666.2 9699.0 -4826.1 9652.2
## mdl2.max 8 9667.6 9705.1 -4825.8 9651.6 0.6272 1 0.4284
anova(mdl2.max,mdl2.int) # max vs int
## refitting model(s) with ML (instead of REML)
## Data: data[data$region == 4, ]
## Models:
## mdl2.int: rt ~ freq + gram + age + accuracy + (1 | subj)
## mdl2.max: rt ~ freq * gram + age + accuracy + (1 | subj)
## Df AIC BIC logLik deviance Chisq Chi Df Pr(>Chisq)
## mdl2.int 7 9666.3 9699.1 -4826.1 9652.3
## mdl2.max 8 9667.6 9705.1 -4825.8 9651.6 0.6526 1 0.4192
anova(mdl2.int,mdl2.frq) # int vs frq
## refitting model(s) with ML (instead of REML)
## Data: data[data$region == 4, ]
## Models:
## mdl2.frq: rt ~ gram + age + accuracy + (1 | subj)
## mdl2.int: rt ~ freq + gram + age + accuracy + (1 | subj)
## Df AIC BIC logLik deviance Chisq Chi Df Pr(>Chisq)
## mdl2.frq 6 9665.8 9693.9 -4826.9 9653.8
## mdl2.int 7 9666.3 9699.1 -4826.1 9652.3 1.522 1 0.2173
anova(mdl2.int,mdl2.grm) # int vs grm
## refitting model(s) with ML (instead of REML)
## Data: data[data$region == 4, ]
## Models:
## mdl2.grm: rt ~ freq + age + accuracy + (1 | subj)
## mdl2.int: rt ~ freq + gram + age + accuracy + (1 | subj)
## Df AIC BIC logLik deviance Chisq Chi Df Pr(>Chisq)
## mdl2.grm 6 9671.5 9699.6 -4829.7 9659.5
## mdl2.int 7 9666.3 9699.1 -4826.1 9652.3 7.2132 1 0.007237 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
mdl3.max <- lmer(rt ~ freq*gram + age + accuracy + (1|subj),data[data$region==5,])
mdl3.age <- lmer(rt ~ freq*gram + accuracy + (1|subj),data[data$region==5,])
mdl3.acc <- lmer(rt ~ freq*gram + age + (1|subj),data[data$region==5,])
mdl3.int <- lmer(rt ~ freq+gram + age + accuracy + (1|subj),data[data$region==5,])
mdl3.frq <- lmer(rt ~ gram + age + accuracy + (1|subj),data[data$region==5,])
mdl3.grm <- lmer(rt ~ freq + age + accuracy + (1|subj),data[data$region==5,])
anova(mdl3.max,mdl3.age) # max vs age
## refitting model(s) with ML (instead of REML)
## Data: data[data$region == 5, ]
## Models:
## mdl3.age: rt ~ freq * gram + accuracy + (1 | subj)
## mdl3.max: rt ~ freq * gram + age + accuracy + (1 | subj)
## Df AIC BIC logLik deviance Chisq Chi Df Pr(>Chisq)
## mdl3.age 7 10352 10385 -5169.2 10338
## mdl3.max 8 10352 10389 -5167.9 10336 2.6005 1 0.1068
anova(mdl3.max,mdl3.acc) # max vs acc
## refitting model(s) with ML (instead of REML)
## Data: data[data$region == 5, ]
## Models:
## mdl3.acc: rt ~ freq * gram + age + (1 | subj)
## mdl3.max: rt ~ freq * gram + age + accuracy + (1 | subj)
## Df AIC BIC logLik deviance Chisq Chi Df Pr(>Chisq)
## mdl3.acc 7 10350 10383 -5168.1 10336
## mdl3.max 8 10352 10389 -5167.9 10336 0.3072 1 0.5794
anova(mdl3.max,mdl3.int) # max vs int
## refitting model(s) with ML (instead of REML)
## Data: data[data$region == 5, ]
## Models:
## mdl3.int: rt ~ freq + gram + age + accuracy + (1 | subj)
## mdl3.max: rt ~ freq * gram + age + accuracy + (1 | subj)
## Df AIC BIC logLik deviance Chisq Chi Df Pr(>Chisq)
## mdl3.int 7 10350 10383 -5168.0 10336
## mdl3.max 8 10352 10389 -5167.9 10336 0.1517 1 0.6969
anova(mdl3.int,mdl3.frq) # int vs frq
## refitting model(s) with ML (instead of REML)
## Data: data[data$region == 5, ]
## Models:
## mdl3.frq: rt ~ gram + age + accuracy + (1 | subj)
## mdl3.int: rt ~ freq + gram + age + accuracy + (1 | subj)
## Df AIC BIC logLik deviance Chisq Chi Df Pr(>Chisq)
## mdl3.frq 6 10348 10376 -5168 10336
## mdl3.int 7 10350 10383 -5168 10336 0.0287 1 0.8655
anova(mdl3.int,mdl3.grm) # int vs grm
## refitting model(s) with ML (instead of REML)
## Data: data[data$region == 5, ]
## Models:
## mdl3.grm: rt ~ freq + age + accuracy + (1 | subj)
## mdl3.int: rt ~ freq + gram + age + accuracy + (1 | subj)
## Df AIC BIC logLik deviance Chisq Chi Df Pr(>Chisq)
## mdl3.grm 6 10361 10389 -5174.5 10349
## mdl3.int 7 10350 10383 -5168.0 10336 12.97 1 0.0003165 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
First, how could we go about using lmer for rating data?
mdl.ord <- lmer(rating ~ freq*gram + (1|subj), data%>%filter(region==1))
summary(mdl.ord)
## Linear mixed model fit by REML ['lmerMod']
## Formula: rating ~ freq * gram + (1 | subj)
## Data: data %>% filter(region == 1)
##
## REML criterion at convergence: 2376.8
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -3.04800 -0.83352 -0.04503 0.75865 2.94849
##
## Random effects:
## Groups Name Variance Std.Dev.
## subj (Intercept) 0.01176 0.1085
## Residual 1.11860 1.0576
## Number of obs: 800, groups: subj, 40
##
## Fixed effects:
## Estimate Std. Error t value
## (Intercept) 2.39500 0.07673 31.215
## freqlow -0.47000 0.10576 -4.444
## gramyes 1.82000 0.10576 17.208
## freqlow:gramyes 0.32000 0.14957 2.139
##
## Correlation of Fixed Effects:
## (Intr) freqlw gramys
## freqlow -0.689
## gramyes -0.689 0.500
## frqlw:grmys 0.487 -0.707 -0.707
library(ordinal)
##
## Attaching package: 'ordinal'
## The following objects are masked from 'package:lme4':
##
## ranef, VarCorr
## The following object is masked from 'package:dplyr':
##
## slice
For the ratings, build models like above, but using clmm() (these take a little longer to run):
mdl.ord.clmm <- clmm(as.factor(rating) ~ freq*gram + age + (1|subj), data%>%filter(region==1))
summary(mdl.ord.clmm)
## Cumulative Link Mixed Model fitted with the Laplace approximation
##
## formula: as.factor(rating) ~ freq * gram + age + (1 | subj)
## data: data %>% filter(region == 1)
##
## link threshold nobs logLik AIC niter max.grad cond.H
## logit flexible 800 -1048.20 2114.41 536(1078) 9.25e-05 2.8e+05
##
## Random effects:
## Groups Name Variance Std.Dev.
## subj (Intercept) 0.02675 0.1636
## Number of groups: subj 40
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## freqlow -0.928986 0.186594 -4.979 6.4e-07 ***
## gramyes 2.846015 0.207608 13.709 < 2e-16 ***
## age -0.007218 0.006135 -1.177 0.2394
## freqlow:gramyes 0.674074 0.264725 2.546 0.0109 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Threshold coefficients:
## Estimate Std. Error z value
## 1|2 -1.35904 0.30108 -4.514
## 2|3 -0.06295 0.29158 -0.216
## 3|4 1.14857 0.29870 3.845
## 4|5 2.52700 0.31329 8.066
See visualising_ordinal_data.R for Predicted Curves script
data %>%
filter(region==1) %>%
mutate(age_group = case_when(age<35~"young",
age>=35&age<=55~"middle",
age>55~"old")) %>%
mutate(age_group = factor(age_group,levels=c("young","middle","old")))%>%
group_by(freq,gram,age_group) %>%
summarise(accuracy=sum(accuracy)/n()) %>%
ggplot(aes(x=freq,fill=gram,y=accuracy))+
geom_bar(stat="identity",position="dodge")+
facet_wrap(~age_group)
Does accuracy change as a function of age?
# summary(
# glmer( DV ~ IV1 * IV2 + IV3 + ( 1 | RV1 ) + ( 1 | RV2 ), family="binomial", data = data)
# )
summary(glmer(accuracy ~ age + (1|subj), family="binomial", data=data[data$region==1,]))
## singular fit
## Generalized linear mixed model fit by maximum likelihood (Laplace
## Approximation) [glmerMod]
## Family: binomial ( logit )
## Formula: accuracy ~ age + (1 | subj)
## Data: data[data$region == 1, ]
##
## AIC BIC logLik deviance df.resid
## 516.8 530.8 -255.4 510.8 797
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -4.4618 0.2444 0.3034 0.3787 0.5932
##
## Random effects:
## Groups Name Variance Std.Dev.
## subj (Intercept) 0 0
## Number of obs: 800, groups: subj, 40
##
## Fixed effects:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 4.11587 0.49760 8.271 < 2e-16 ***
## age -0.04326 0.01025 -4.220 2.44e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr)
## age -0.971
## convergence code: 0
## singular fit
Do model comparison to assess the contribution of each of the factors to the overall fit:
mdl4.max <- glmer(accuracy ~ freq*gram + age + (1|subj),data[data$region==1,], family="binomial")
mdl4.age <- glmer(accuracy ~ freq*gram + (1|subj),data[data$region==1,], family="binomial")
mdl4.int <- glmer(accuracy ~ freq+gram + age + (1|subj),data[data$region==1,], family="binomial")
## singular fit
mdl4.frq <- glmer(accuracy ~ gram + age + (1|subj),data[data$region==1,], family="binomial")
## singular fit
mdl4.grm <- glmer(accuracy ~ freq + age + (1|subj),data[data$region==1,], family="binomial")
## singular fit
anova(mdl4.max,mdl4.age) # max vs age
## Data: data[data$region == 1, ]
## Models:
## mdl4.age: accuracy ~ freq * gram + (1 | subj)
## mdl4.max: accuracy ~ freq * gram + age + (1 | subj)
## Df AIC BIC logLik deviance Chisq Chi Df Pr(>Chisq)
## mdl4.age 5 525.76 549.18 -257.88 515.76
## mdl4.max 6 512.57 540.67 -250.28 500.57 15.193 1 9.707e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
anova(mdl4.max,mdl4.int) # max vs int
## Data: data[data$region == 1, ]
## Models:
## mdl4.int: accuracy ~ freq + gram + age + (1 | subj)
## mdl4.max: accuracy ~ freq * gram + age + (1 | subj)
## Df AIC BIC logLik deviance Chisq Chi Df Pr(>Chisq)
## mdl4.int 5 519.15 542.58 -254.58 509.15
## mdl4.max 6 512.57 540.67 -250.28 500.57 8.5857 1 0.003388 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
anova(mdl4.int,mdl4.frq) # int vs frq
## Data: data[data$region == 1, ]
## Models:
## mdl4.frq: accuracy ~ gram + age + (1 | subj)
## mdl4.int: accuracy ~ freq + gram + age + (1 | subj)
## Df AIC BIC logLik deviance Chisq Chi Df Pr(>Chisq)
## mdl4.frq 4 518.55 537.29 -255.27 510.55
## mdl4.int 5 519.15 542.58 -254.58 509.15 1.3962 1 0.2374
anova(mdl4.int,mdl4.grm) # int vs grm
## Data: data[data$region == 1, ]
## Models:
## mdl4.grm: accuracy ~ freq + age + (1 | subj)
## mdl4.int: accuracy ~ freq + gram + age + (1 | subj)
## Df AIC BIC logLik deviance Chisq Chi Df Pr(>Chisq)
## mdl4.grm 4 517.38 536.11 -254.69 509.38
## mdl4.int 5 519.15 542.58 -254.58 509.15 0.2234 1 0.6365